READ - I've created a Python script that lets the user click a button to either show all of the underlying code OR just look at the raw output (charts, plots, whatever).
As you know, sometimes these notebooks contain a fair amount of code... and sometimes folks just want the results... here is an example
Code defaults to NOT showing any code, so click the toggle button to show the underlying code...
# %load toggle.py
# Adds a button that shows/hides the notebook's code input cells.
# Only elements matching `div.input` are toggled; they start hidden once the
# document is ready, and the button label always names the next action.
# Animation durations are passed as numbers (milliseconds): jQuery only accepts
# the keywords 'fast'/'slow' as strings, so the string '500' was not applied.
from IPython.display import HTML
HTML('''<script>
var code_shown = false;
function code_toggle() {
  if (code_shown) {
    $('div.input').hide(500);
    $('#toggleButton').val('Show All Notebook Code');
  } else {
    $('div.input').show(500);
    $('#toggleButton').val('Hide All Notebook Code');
  }
  code_shown = !code_shown;
}
$(document).ready(function () {
  code_shown = false;
  $('div.input').hide();
});</script><form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show All Notebook Code"></form>''')
import warnings
# Install a blanket 'ignore' filter so that no warnings are shown.
warnings.simplefilter('ignore')
import os
import dask
import dask.dataframe as dd
from dask.distributed import Client, progress
import pandas as pd
# Set each of these pandas display options to None (for max_rows, max_columns
# and max_colwidth this removes the truncation limit on printed frames).
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
# Import from the public IPython.display module (the same path the toggle cell
# uses); importing display/HTML from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS that stretches the notebook's .container element to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"
# Plotly imports and notebook initialisation.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly.plotly as py
import plotly.graph_objs as go
# NOTE(review): the next three imports repeat names already imported above.
from plotly.offline import download_plotlyjs
from plotly.offline import init_notebook_mode
from plotly.offline import plot, iplot
import plotly
# NOTE(review): duplicate of the earlier `go` import.
import plotly.graph_objs as go
# Initialise plotly's notebook mode so iplot() can render inline
# (presumably connected=True loads plotly.js from a CDN — TODO confirm).
init_notebook_mode(connected=True)
import plotly.io as pio
# sns.set_style("white")
%cd D:\CRITICAL_MAIN_DATAFILE__MILESTONE_II\AoT_Chicago.complete.2021-09-12
D:\CRITICAL_MAIN_DATAFILE__MILESTONE_II\AoT_Chicago.complete.2021-09-12
# Load the readings (about 1.8M rows) with the 'timestamp' column as the index.
# No date parsing is requested, so the index keeps the raw timestamp strings.
csv_name = 'use_for_UML.csv'
df = pd.read_csv(csv_name, index_col=['timestamp'])
I will keep the separate timestamp column `T` for my plots.
# Preview the first and the last five rows of the frame.
df.head(5)
df.tail(5)
| sensor | value_hrf | T | |
|---|---|---|---|
| timestamp | |||
| 2018-05-01 00:00:18 | bmp180 | 30.2 | 2018-05-01 00:00:18 |
| 2018-05-01 00:00:44 | bmp180 | 30.3 | 2018-05-01 00:00:44 |
| 2018-05-01 00:01:10 | bmp180 | 30.4 | 2018-05-01 00:01:10 |
| 2018-05-01 00:01:35 | bmp180 | 30.3 | 2018-05-01 00:01:35 |
| 2018-05-01 00:02:01 | bmp180 | 30.4 | 2018-05-01 00:02:01 |
| sensor | value_hrf | T | |
|---|---|---|---|
| timestamp | |||
| 2019-10-31 23:57:55 | bmp180 | 2.5 | 2019-10-31 23:57:55 |
| 2019-10-31 23:58:20 | bmp180 | 2.5 | 2019-10-31 23:58:20 |
| 2019-10-31 23:58:45 | bmp180 | 2.5 | 2019-10-31 23:58:45 |
| 2019-10-31 23:59:10 | bmp180 | 2.6 | 2019-10-31 23:59:10 |
| 2019-10-31 23:59:35 | bmp180 | 2.6 | 2019-10-31 23:59:35 |
# Remove the 'sensor' column in place.
df.drop(columns='sensor', inplace=True)
# Convert the T column to datetime64[ns].
df['T'] = df['T'].astype('datetime64[ns]')
# Show the resulting column dtypes.
df.dtypes
value_hrf float64 T datetime64[ns] dtype: object
# # Using graph_objects
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly
# import matplotlib.pyplot as plt
# from matplotlib import pyplot
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# fig = go.Figure(data=[go.Scatter(x=df.index, y=df.value_hrf)])
# iplot(fig);
# tdf = df.loc['2018-09-01':'2018-10-01'].copy()
# The index holds timestamp strings (read_csv was called without date parsing),
# so .loc slices by string comparison. A bare '2019-08-31' upper bound sorts
# before every '2019-08-31 HH:MM:SS' label and would silently drop that whole
# final day, so the end-of-day timestamp is spelled out to include it.
tdf = df.loc['2018-07-01':'2019-08-31 23:59:59'].copy()
del df # to be safe
# NOTE(review): the count was 1 439 432 (1.4M rows, single sensor) when the
# slice stopped short of 2019-08-31; expect it to grow by that day's rows.
len(tdf)
1439432
# from sklearn.ensemble import IsolationForest
# clf = IsolationForest(n_estimators=100,
# max_samples='auto',
# contamination=float(.01),
# max_features=1.0,
# bootstrap=False,
# n_jobs=-1,
# random_state=42,
# verbose=1)
# clf.fit(tdf[['value_hrf']]) # since 1D !
# # The predict function classifies the data as anomalies
# # based on the results from decision function on crossing a threshold
# tdf['scores']=clf.decision_function(tdf[['value_hrf']])
# #tdf['anomaly']=clf.predict(tdf[['value_hrf']])
# #tdf.loc[tdf['anomaly'] == 1,'anomaly'] = 0
# #tdf.loc[tdf['anomaly'] == -1,'anomaly'] = 1
# ## classified as -1 is 'anomalous'
# #tdf.anomaly.value_counts()
# pred = clf.predict(tdf[['value_hrf']])
# tdf['anomaly']=pred
# outliers=tdf.loc[tdf['anomaly']==-1]
# outlier_index=list(outliers.index)
# #Find the number of anomalies and normal points here points classified -1 are anomalous
# print(tdf['anomaly'].value_counts())
# IsolationForest(contamination=0.01, n_jobs=-1, random_state=42, verbose=1)
# 1 86706
# -1 857
# Name: anomaly, dtype: int64
# print("Percentage of anomalies in data: {:.2f}".format((len(tdf.loc[tdf['anomaly']==-1])/len(tdf))*100))
# import seaborn as sns
# import matplotlib.pyplot as plt
# %matplotlib inline
# # sns.set(style="darkgrid")
# fig, ax = plt.subplots(figsize=(15,10))
# sns.histplot(data=tdf, x="scores")
# plt.show();
This code will plot the entire selected date range (`tdf`):
# Plot every reading in tdf as a single scatter trace.
# Handy reference for exporting:
#   import plotly.graph_objects as go
#   help(go.Figure.write_html)
temperature_trace = go.Scatter(x=tdf.index, y=tdf.value_hrf)
layout_options = dict(
    title="Raw Temperature Data",
    xaxis_title="timerange of recordings",
    yaxis_title="Temperature in Celsius",
    template='plotly_white',
    font=dict(size=12, color="RebeccaPurple"),
)
fig = go.Figure(data=[temperature_trace])
fig = fig.update_layout(**layout_options)

# Render the figure inline in the notebook.
iplot(fig)

# --- if you want to export the plot to html for further study ---
# this code works for exporting:
# fig.write_html("D:\\GITHUB_Repos\\SensorAnalysis\\ENTER\\results\\iso_forest_base_data_plotted.html")